import requests
from bs4 import BeautifulSoup
from time import sleep
import random
import os
import re

# spider csdn articles
def spider_csdn(save_art_path, all_counts = 50):
    url = r'https://www.csdn.net'
    main_html = requests.get(url)   # request the main_page
    # print(main_html.text)
    main_soup = BeautifulSoup(main_html.content, 'html.parser')
    main_tagA = main_soup.select('#feedlist_id > li > div > div.title > h2 > a')
    main_hrefs = [tag['href'] for tag in main_tagA]
    # print('main_hrefs: ', main_hrefs)

    count = 0
    try:
        # spider every article
        for url in main_hrefs:
            count += 1  # 计数

            art_req = requests.get(url)     # request one article
            art_soup = BeautifulSoup(art_req.content, 'html.parser')

            art_content = art_soup.select_one('#mainBox > main > div.blog-content-box').text
            art_title = art_soup.select_one('#mainBox > main > div.blog-content-box > div > div > div.article-title-box > h1').text.split()
            print('art_content: ', art_content)
            print('art_title: ', art_title)

            # handle art_content
            art_text = [line.strip() for line in art_content.split('\n')]       # handle strip()
            art_text = [re.sub('(\t+)', '', line) for line in art_text if line != '']   # handle \t
            print('art_text: ', art_text)
            content = ' '.join(art_text)
            print('content: ', content)


            with open(os.path.join(save_art_path, art_title[0]), 'w+', encoding='utf-8') as fw:
                fw.write(content)

            if count >= all_counts:     # judge the counts, keep the count <= all_counts
                return

            sleep(random.uniform(0, 1))
    except Exception as e:
        spider_csdn(save_art_path, all_counts)  # itera the method
        sleep(random.uniform(0, 1))


if __name__ == '__main__':
    # Output directory for the crawled CSDN corpus.
    target_dir = r'E:\NLP1\骚操作\自学分词\csdn语料库训练\csdn_spider_datas'
    article_limit = 3
    spider_csdn(target_dir, article_limit)